In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
%matplotlib inline
In [84]:
# Load train.csv (path is relative to the notebook's working directory).
data = pd.read_csv('train.csv')
In [85]:
# Column dtypes and non-null counts, used to spot columns with missing values.
data.info()
There are missing values in the Age, Cabin and Embarked columns. Let's look at the summary statistics of the numeric columns.
In [86]:
# Summary statistics of the numeric columns.
data.describe()
Out[86]:
To train a machine learning algorithm, we need to replace the letters in the Embarked column with numbers: S, C and Q become 0, 1 and 2. Similarly, we replace male and female in the Sex column with 1 and 0 respectively. The raw Cabin column is not used directly for training; instead we keep only its first letter (with 'U' standing for an unknown cabin) and encode that letter as a number.
In [87]:
# Encode the categorical columns as integers in new columns.
# Missing cabins become 'Unknown' so that every row has a first letter ('U').
# fillna targets missing values directly instead of relying on replace() treating
# a None key as "NaN".
data['Cabin'] = data['Cabin'].fillna('Unknown')
# First character of the cabin string, upper-cased (vectorised string accessor).
data['Cabin_modified'] = data['Cabin'].str[0].str.upper()

sex = {'male': 1, 'female': 0}
embarked = {'S': 0, 'C': 1, 'Q': 2}
cabin_class = {'U': 0, 'C': 1, 'E': 2, 'G': 3, 'D': 4, 'A': 5, 'B': 6, 'F': 7, 'T': 8}

# Series.map leaves NaN wherever a value has no entry in the lookup table,
# e.g. for rows whose Embarked value is missing.
data['sex_type'] = data['Sex'].map(sex)
data['embarked_numeric'] = data['Embarked'].map(embarked)
data['Cabin_Class'] = data['Cabin_modified'].map(cabin_class)
There are 2 missing values in the Embarked column and many more (~180) in the Age column. Let us try to estimate these missing values.
In [88]:
# Distribution of the encoded port of embarkation; rows without a code are dropped
# before plotting.
sb.distplot(data['embarked_numeric'].dropna())
Out[88]:
Looking at the above distribution, it is clear that most passengers belong to class 0 (Embarked = 'S'). We therefore assume that the missing embarked_numeric values are 0.
In [89]:
# Show the rows whose original Embarked value is missing.
data[data['Embarked'].isnull()]
Out[89]:
In [90]:
# Impute the missing ports with 0 (the code used for 'S'), then drop the raw text
# column now that embarked_numeric replaces it.
data['embarked_numeric'] = data['embarked_numeric'].fillna(0)
# `axis` is passed by keyword: the positional form drop(label, 1) is rejected by
# pandas >= 2.0.
data = data.drop('Embarked', axis=1)
In [91]:
# Re-check dtypes and non-null counts after encoding and imputing the port of embarkation.
data.info()
In [92]:
# Summary statistics again, now that the encoded numeric columns have been added.
data.describe()
Out[92]:
The age distributions by sex and embarked_numeric value are shown below. We can use these distributions to estimate the missing ages.
In [93]:
# Keep only the rows without any missing value; `d` is reused by the next plot.
d = data.dropna()
# Age spread for each encoded port of embarkation, split by sex.
sb.boxplot(x='embarked_numeric', y='Age', hue='Sex', data=d)
Out[93]:
In [94]:
# Age distribution over `d`, i.e. the rows that have no missing values.
sb.distplot(d.Age)
Out[94]:
In [95]:
# Fill missing ages with the median age of the passengers that share the same sex,
# ticket class and port of embarkation.
# transform('median') returns one value per row, aligned to data's own index, so the
# result can always be assigned back. groupby(...).apply(...) prepends the group keys
# to the index in recent pandas versions, which makes the assignment fail.
group_median_age = data.groupby(['sex_type', 'Pclass', 'embarked_numeric'])['Age'].transform('median')
data['Age'] = data['Age'].fillna(group_median_age)
In [96]:
# Age distribution of the whole frame after the group-median imputation.
sb.distplot(data.Age)
Out[96]:
In [97]:
# Confirm the non-null count of Age after imputation.
data.info()
In [ ]:
# Target kept as a one-column DataFrame (double brackets); later cells flatten it
# with y.values.ravel() before handing it to scikit-learn.
y = data[['Survived']]
In [ ]:
# Columns used as model inputs.
feature_columns = [
    'Pclass',
    'sex_type',
    'embarked_numeric',
    'Parch',
    'SibSp',
    'Age',
    'Fare',
    'Cabin_Class',
]
X = data[feature_columns]
In [ ]:
# Hold out a third of the rows for evaluation. The split is seeded so that a re-run
# produces the same partition and the scores computed from it stay comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
```python
parameters = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}
abc = DecisionTreeClassifier()
clf = GridSearchCV(abc, parameters)
clf.fit(X_train, y_train.values.ravel())
print(clf.best_estimator_)
print(clf.best_score_)

clf.score(X_test, y_test)
```
In [ ]:
# 5-fold cross-validation of a decision tree with fixed criterion and splitter.
DecisionTree = DecisionTreeClassifier(splitter='best', criterion='entropy')
scores = cross_val_score(estimator=DecisionTree, X=X, y=y.values.ravel(), cv=5)
print(scores)
print('Mean Accuracy: %f' % scores.mean())
In [ ]:
# Exhaustive, cross-validated search over random-forest hyper-parameters.
# 'auto' is no longer listed under max_features: for classifiers it is an alias of
# 'sqrt', so it only re-ran every 'sqrt' candidate a second time, and newer
# scikit-learn versions reject the value.
parameters = {
    'max_depth': [4, 6, 8],
    'n_estimators': [50, 10],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 3, 10],
    'min_samples_leaf': [1, 3, 10],
    'bootstrap': [True, False],
}
abc = RandomForestClassifier()
clf = GridSearchCV(abc, parameters)
clf.fit(X, y.values.ravel())
print(clf.best_estimator_)
print(clf.best_score_)
In [ ]:
# Random forest with fixed hyper-parameters, evaluated by 10-fold cross-validation.
# `min_impurity_split` is no longer passed: the argument was removed from
# scikit-learn, and passing it makes the constructor raise a TypeError.
RandomForest = RandomForestClassifier(
    bootstrap=True, class_weight=None, criterion='gini',
    max_depth=6, max_features='sqrt', max_leaf_nodes=None,
    min_samples_leaf=3,
    min_samples_split=3, min_weight_fraction_leaf=0.0,
    n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
    verbose=0, warm_start=False,
)
scores = cross_val_score(RandomForest, X, y.values.ravel(), cv=10)
print(scores)
print('Mean Accuracy: %f' % (np.mean(scores)))
In [ ]:
# Fit the forest itself on all labelled rows (cross_val_score works on clones and
# leaves the estimator passed to it unfitted).
RandomForest.fit(X, y.values.ravel())
In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit
```python
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train.values.ravel())
```
In [ ]:
# RBF-kernel SVM with fixed C and gamma, evaluated by 10-fold cross-validation.
# `decision_function_shape=None` is no longer passed: None is not an accepted value
# in current scikit-learn ('ovo' or 'ovr' only), and the setting is ignored for
# binary classification anyway.
SupportVector = SVC(
    C=100.0, cache_size=200, class_weight=None, coef0=0.0,
    degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False,
)
scores = cross_val_score(SupportVector, X, y.values.ravel(), cv=10)
print(scores)
print('Mean Accuracy: %f' % (np.mean(scores)))
In [ ]:
# Fit the SVM itself on all labelled rows.
SupportVector.fit(X, y.values.ravel())
In [ ]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
In [ ]:
def create_model(optimizer='adam', input_dim=8):
    """Build and compile a two-hidden-layer network for binary classification.

    Parameters
    ----------
    optimizer : str
        Optimizer name forwarded to ``model.compile``.
    input_dim : int
        Number of input features. Defaults to 8, the number of columns in the
        feature matrix ``X`` that the Keras cells of this notebook train on; the
        previously hard-coded value of 6 did not match it.

    Returns
    -------
    A compiled ``Sequential`` model with one sigmoid output unit, trained with
    binary cross-entropy and reporting accuracy.
    """
    model = Sequential()
    # Hidden block 1: 64 sigmoid units followed by 25% dropout.
    model.add(Dense(64, input_shape=(input_dim,)))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.25))
    # Hidden block 2: same layout.
    model.add(Dense(64))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.25))
    # Single sigmoid output unit.
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=["accuracy"])
    return model
In [ ]:
# Compare optimizers for the Keras network with a cross-validated grid search
# on the training split.
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=64, verbose=0)
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = {'optimizer': optimizer}
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_results = grid.fit(X_train.values, y_train.values)
In [ ]:
# Best cross-validated score of the grid search and the parameters that achieved it.
print("Best: %f using %s" % (grid_results.best_score_, grid_results.best_params_))
In [ ]:
# Train a fresh network on the training split and score it on the held-out split.
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=64, verbose=0)
model.fit(X_train.values, y_train.values)
model.score(X_test.values, y_test.values)
In [ ]:
# 10-fold cross-validation of a fresh Keras classifier on the full feature matrix.
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=64, verbose=0)
scores = cross_val_score(model, X.values, y.values, cv=10)
print(scores)
print('Mean Accuracy: %f' % scores.mean())
In [ ]:
# Fresh Keras classifier trained on every labelled row.
model = KerasClassifier(build_fn=create_model,epochs=100,batch_size=64, verbose=0)
model.fit(np.array(X),np.array(y))
In [99]:
# Rebuild the training features from the raw CSV: integer codes, imputed Age/Fare,
# one-hot indicator columns and a coarse passenger title taken from the name.
data = pd.read_csv('train.csv')

# --- Cabin: keep only the first letter; 'U' when the cabin is unknown ---
data['Cabin'] = data['Cabin'].fillna('Unknown')
data['Cabin_modified'] = data['Cabin'].str[0].str.upper()

sex = {'male': 1, 'female': 0}
embarked = {'S': 0, 'C': 1, 'Q': 2}
cabin_class = {'U': 0, 'C': 1, 'E': 2, 'G': 3, 'D': 4, 'A': 5, 'B': 6, 'F': 7, 'T': 8}
data['sex_type'] = data['Sex'].map(sex)
# Rows with a missing port get 0, the code used for 'S'.
data['embarked_numeric'] = data['Embarked'].map(embarked).fillna(0)
data['Cabin_Class'] = data['Cabin_modified'].map(cabin_class)
# Keyword `axis`: the positional form drop(label, 1) is rejected by pandas >= 2.0.
data = data.drop('Embarked', axis=1)

# --- Age / Fare: fill gaps with the median of the (sex, class, port) group ---
# transform('median') stays aligned with data's index, unlike groupby(...).apply(...),
# whose result carries the group keys in its index in recent pandas versions.
impute_keys = ['sex_type', 'Pclass', 'embarked_numeric']
for column in ('Age', 'Fare'):
    group_median = data.groupby(impute_keys)[column].transform('median')
    data[column] = data[column].fillna(group_median)

# --- One-hot indicators: 1 where the source column equals the code, otherwise 0 ---
for letter, code in cabin_class.items():
    data['Cabin_Class_' + letter] = (data['Cabin_Class'] == code).astype(int)
for port, code in embarked.items():
    data['Embarked_' + port] = (data['embarked_numeric'] == code).astype(int)
for pclass in (1, 2, 3):
    data['Pclass%d' % pclass] = (data['Pclass'] == pclass).astype(int)

# --- Title: the text between the comma and the first period of Name ---
data['Title'] = data['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip())
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}
# Titles missing from the dictionary map to NaN and get no dummy column set.
data['Title'] = data['Title'].map(Title_Dictionary)
titles_dummies = pd.get_dummies(data['Title'], prefix='Title')
data = pd.concat([data, titles_dummies], axis=1)
In [100]:
# Model inputs for the tree-based models below; the target is now a plain Series.
feature_columns = [
    'sex_type', 'Title_Mr', 'Fare', 'Title_Miss', 'Title_Mrs', 'Pclass3',
    'Age', 'SibSp', 'Pclass1', 'Parch', 'Pclass2',
]
X = data[feature_columns]
y = data['Survived']
In [105]:
# Baseline random forest with default hyper-parameters, 10-fold cross-validated.
RandomForest = RandomForestClassifier()
scores = cross_val_score(RandomForest, X, y.values.ravel(), cv=10)
print(scores)
print('Mean Accuracy: %f' % (np.mean(scores)))
# cross_val_score only fits clones, so RandomForest itself is still unfitted at this
# point. Fit it on all rows so that the cells reading feature_importances_ or calling
# predict() work on a fresh top-to-bottom run.
RandomForest.fit(X, y.values.ravel())
In [104]:
# Feature importances of the fitted forest, smallest first, as a horizontal bar chart.
features = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': list(RandomForest.feature_importances_),
    })
    .sort_values(by=['importance'], ascending=True)
    .set_index('feature')
)
features.plot(kind='barh', figsize=(5, 5))
Out[104]:
In [106]:
# Exhaustive, cross-validated search over random-forest hyper-parameters on the
# one-hot feature set.
# 'auto' is no longer listed under max_features: for classifiers it is an alias of
# 'sqrt', so it only re-ran every 'sqrt' candidate a second time, and newer
# scikit-learn versions reject the value.
parameters = {
    'max_depth': [4, 6, 8, 12],
    'n_estimators': [100, 50],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 3, 10],
    'min_samples_leaf': [1, 3, 10],
    'bootstrap': [True, False],
}
abc = RandomForestClassifier()
clf = GridSearchCV(abc, parameters)
clf.fit(X, y.values.ravel())
print(clf.best_estimator_)
print(clf.best_score_)
In [107]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

# Fit an extra-trees classifier on all features, then let SelectFromModel keep the
# columns it retains based on that already-fitted estimator (prefit=True).
clf = ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(estimator=clf, prefit=True)
X_new = model.transform(X)
X_new.shape
Out[107]:
In [109]:
# Refit the same ExtraTrees estimator on the reduced feature matrix.
# NOTE(review): `model` (SelectFromModel with prefit=True) wraps this very object, so
# after this call the selector is backed by an estimator trained on X_new's columns —
# confirm model.transform is not needed again on the full feature set.
clf.fit(X_new,y)
Out[109]:
In [114]:
# Mean 10-fold cross-validation score of the classifier on the selected features.
s = cross_val_score(estimator=clf, X=X_new, y=y, cv=10)
print(s.mean())
In [78]:
# Build the test-set features from the raw CSV with the same steps applied to
# train.csv: integer codes, imputed Age/Fare, one-hot indicator columns and a coarse
# passenger title taken from the name.
data = pd.read_csv('test.csv')

# --- Cabin: keep only the first letter; 'U' when the cabin is unknown ---
data['Cabin'] = data['Cabin'].fillna('Unknown')
data['Cabin_modified'] = data['Cabin'].str[0].str.upper()

sex = {'male': 1, 'female': 0}
embarked = {'S': 0, 'C': 1, 'Q': 2}
cabin_class = {'U': 0, 'C': 1, 'E': 2, 'G': 3, 'D': 4, 'A': 5, 'B': 6, 'F': 7, 'T': 8}
data['sex_type'] = data['Sex'].map(sex)
# Rows with a missing port get 0, the code used for 'S'.
data['embarked_numeric'] = data['Embarked'].map(embarked).fillna(0)
data['Cabin_Class'] = data['Cabin_modified'].map(cabin_class)
# Keyword `axis`: the positional form drop(label, 1) is rejected by pandas >= 2.0.
data = data.drop('Embarked', axis=1)

# --- Age / Fare: fill gaps with the median of the (sex, class, port) group ---
# transform('median') stays aligned with data's index, unlike groupby(...).apply(...),
# whose result carries the group keys in its index in recent pandas versions.
impute_keys = ['sex_type', 'Pclass', 'embarked_numeric']
for column in ('Age', 'Fare'):
    group_median = data.groupby(impute_keys)[column].transform('median')
    data[column] = data[column].fillna(group_median)

# --- One-hot indicators: 1 where the source column equals the code, otherwise 0 ---
for letter, code in cabin_class.items():
    data['Cabin_Class_' + letter] = (data['Cabin_Class'] == code).astype(int)
for port, code in embarked.items():
    data['Embarked_' + port] = (data['embarked_numeric'] == code).astype(int)
for pclass in (1, 2, 3):
    data['Pclass%d' % pclass] = (data['Pclass'] == pclass).astype(int)

# --- Title: the text between the comma and the first period of Name ---
data['Title'] = data['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip())
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}
# Titles missing from the dictionary map to NaN and get no dummy column set.
data['Title'] = data['Title'].map(Title_Dictionary)
titles_dummies = pd.get_dummies(data['Title'], prefix='Title')
data = pd.concat([data, titles_dummies], axis=1)
In [79]:
# Select exactly the columns, in exactly the order, of the training feature matrix:
# the prediction cell passes a bare array, so the model matches features by position.
# 'Cabin_Class_U' is not among the training features and is therefore not selected
# here either (it made the test matrix one column wider than the training one).
X = data[['sex_type','Title_Mr','Fare','Title_Miss','Title_Mrs','Pclass3','Age','SibSp','Pclass1','Parch','Pclass2']]
In [80]:
# Predict the label for every test row. RandomForest must already be fitted on
# training features laid out in the same column order as X, because the array passed
# here carries no column names.
y = RandomForest.predict(np.array(X))
In [2]:
# Two-column table (PassengerId, Survived) written to titanic.csv without the index.
final = data[['PassengerId']].copy()
final['Survived'] = y
final.to_csv('titanic.csv', index=None)
In [1]:
# Sanity check: (number of predicted rows, 2 columns).
final.shape
In [ ]: